Text analysis: title and abstract of male and female speakers
Abstracts
# Load the presentations table from the project CSV.
# header = TRUE: the first line holds the column names.
# as.is = TRUE: character columns stay character (no factor conversion).
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
data <- read.table("data/presentations_PPGE_2008-2019.csv", sep = ",",
                   header = TRUE, as.is = TRUE)
# Parse the date column with dmy() and derive the year used for grouping later.
data$date <- dmy(data$date)
data$year <- year(data$date)
#skimr::skim(data)
Excluding special events, such as round tables and discussions, that are not related to a project or study presented by someone.
IDs <- c(154, 250, 211, 289)
data <- data %>% filter(!id %in% IDs)Using abstracts in English (original or translated)
data <- data %>% filter(!is.na(abstract_english)) Number of abstracts per group
table(data$gender)##
## F M
## 101 139
table(data$position_cat,data$gender)##
## F M
## others 4 1
## postdoc 21 21
## professor 21 60
## student 53 56
Tidytext
# Build the token table: keep the identifying columns, paste title and abstract
# into a single `text` field, and split it into a `word` column with
# unnest_tokens().
text_tok <- data %>% dplyr::select(id,gender,position_cat, audience_n,
abstract_english, title_english) %>%
mutate(text = paste(title_english, abstract_english)) %>%
unnest_tokens(output=word,input=text)
# English stop-word list wrapped in a one-column tibble so it can be
# anti-joined on `word`.
stop_w <- tibble(word = stopwords("en"))
# remove stopwords
# The result is sorted by word; the positional slice below relies on that order.
text <- text_tok %>%
anti_join(stop_w, by="word") %>% arrange(word)
# remove numbers and other characters
# NOTE(review): slice(-c(1:290)) drops the first 290 rows of the word-sorted
# table by position. The count is tied to the current data and sort order --
# re-check it whenever the input file or the sorting changes.
text <- text %>% slice(-c(1:290)) %>% # number and some symbols
filter(nchar(word)!=1) %>% # letters alone
filter(!word %in% c("mpas", "ÎŽ13c", "ÎČ") )# remove acronyms, symbols
# solving some simple plurals
plural <- c("actions","advances", "adaptations", "amphibians", "animals", "ants","anurans",
"applications","approaches", "bees","builds", "birds",
"cerrados","challenges",
"continents","crops",
"decisions","declines","determines","determinants", "defenses",
"dynamics",
"economics", "ecosystems","environments", "experiences",
"forests",
"genetics","gifts","gradients","guides","impacts",
"increases","interactions","lives",
"landscapes","males","mammals", "mangroves","models","movements",
"mutualisms","networks","neotropics",
"opilions","phenotypes","plants","projects","paths", "perspectives",
"populations","promotes","relationships", "relations",
"resources","responses","roads","services","skulls","snakes","seeds",
"spaces", "spiders","stages", "trees", "variations",
"threats")
text$word[text$word %in% plural] <-
substr(text$word[text$word %in% plural],
1,nchar(text$word[text$word %in% plural])-1)- Grouping similar words:
lemma <- rbind(c("adaptive", "adaptation"),
c("advancement", "advance"),
c("agricultural", "agriculture"),
c("agro", "agriculture" ),
c("amazonia","amazon" ),
c("amazonian","amazon" ),
c("andean","andes"),
c("apply","application"),
c("applying","application"),
c("apidae","apis"),
c("arachnida","arachnid"),
c("argue","argument"),
c("basal", "basis"),
c("behavioral","behavior"),
c("behavioural","behavior"),
c("bignonieae", "bignoniaceae"),
c("biological", "biology"),
c("brazilian","brazil"),
c("building","build"),
c("changing", "change"),
c("cnidarian", "cnidaria"),
c("coastal","coast"),
c("colour", "color"),
c("colors", "color"),
c("communities","community" ),
c("competitive", "competition"),
c("complexity", "complex"),
c("convergences", "convergence"),
c("convergent", "convergence"),
c("cordatus","cordata" ),
c("croplands","crop"),
c( "cultural", "culture"),
c("darwin's", "darwin"),
c("darwinian", "darwin"),
c("defensive", "defense"),
c("dependent","dependence"),
c("detecting","detection"),
c("determine", "determinant"),
c("developmental", "development"),
c("dispersers","dispersal"),
c("disturbed", "disturbance"),
c("diversification", "diversity"),
c("dragonflies", "dragonfly"),
c("drier", "drought"),
c("ecological", "ecology"),
c("ecologists", "ecology"),
c("endemic", "endemism"),
c("effectiveness", "efficiency"),
c("environmental", "environment"),
c("evolutionary", "evolution"),
c("expanding", "expansion"),
c("extinct", "extinction"),
c("facilitate", "facilitation"),
c("fisheries", "fishery"),
c("floral", "flora"),
c("floristic", "flora"),
c("forested", "forest"),
c("functional", "function"),
c("functionally", "function"),
c("functioning", "function"),
c("geographical", "geographic"),
c("heterogeneties", "heterogeneity"),
c("heterogeneous", "heterogeneity"),
c("histories", "history"),
c("integrated", "integration"),
c("intregating", "integration"),
c("integrative", "integration"),
c("invasive", "invasion"),
c("isotopic", "isotope"),
c("linking", "link"),
c("living", "live"),
c("mammalia", "mammal"),
c("managed", "manage"),
c("managers", "manage"),
c("mathematical", "mathematics"),
c("mates", "mating"),
c("mediated", "mediate"),
c("mechanistic", "mechanism"),
c("matrices", "matrix"),
c("migratory", "migration"),
c("mimicking", "mimicry"),
c("modeling", "model"),
c("mutualistic", "mutualism"),
c("natural", "nature"),
c("neotropical", "neotropic"),
c("northeastern", "northeast"),
c("occuring", "occur"),
c("onça", "onca"),
c("opiliones", "opilion"),
c("parasite", "parasitism"),
c("parent", "parenting"),
c("phylogenies", "phylogeny"),
c("phylogenetic", "phylogeny"),
c("phylogenomic", "phylogeny"),
c("pollinators", "pollination"),
c("protected", "protect"),
c("protective", "protect"),
c("rainfall", "rain"),
c("reconstructing", "reconstruction"),
c("regulatory", "regulation"),
c("regulates", "regulation"),
c("relation", "relationship"),
c("reproductive", "reproduction"),
c("restored", "restoration"),
c("robustness", "robust"),
c("scientific", "science"),
c("scientist", "science"),
c("sexy", "sexual"),
c("simulated", "simulation"),
c("societies", "society"),
c("social", "society"),
c("socio", "society"),
c("space", "spatial"),
c("spacio", "spatial"),
c("stabilize", "stability"),
c("stable", "stability"),
c("stories", "story"),
c("strategic", "strategy"),
c("strategies", "strategy"),
c("structured", "structure"),
c("structuring", "structure"),
c("studies", "study"),
c("studing", "study"),
c("sustainable", "sustainability"),
c("theories", "theory"),
c("theoretical", "theory"),
c("threatened", "threat"),
c("tropical", "tropic"),
c("vision", "visual")
)
# Apply the lemma table: every token equal to a variant (column 1) is replaced
# by its canonical form (column 2).
# stringsAsFactors = FALSE keeps both columns as character on R < 4.0, where
# the factor default would make the assignment below write factor codes
# instead of the replacement words.
lemma <- as.data.frame(lemma, stringsAsFactors = FALSE)
# seq_len() iterates zero times on an empty table (1:0 would yield c(1, 0)).
for (i in seq_len(nrow(lemma))) {
  text$word[text$word == lemma[i, 1]] <- lemma[i, 2]
}
WORDS - all data
table(text$gender)##
## F M
## 10812 13614
table(text$position_cat ,text$gender)##
## F M
## others 262 139
## postdoc 2793 2494
## professor 2062 5370
## student 5524 5531
Mean number of words by abstract
text %>% count(id,gender) %>%
ggplot(aes(x=gender, y=n)) +
geom_violin() + geom_boxplot(width=0.2)+
ggbeeswarm::geom_quasirandom(size=3, shape=21) 20 palavra mais comuns
text %>%
count(word, sort = TRUE) %>%
top_n(20,n)%>%
kable()| word | n |
|---|---|
| species | 384 |
| ecology | 188 |
| forest | 175 |
| model | 157 |
| study | 156 |
| environment | 137 |
| evolution | 134 |
| can | 132 |
| landscape | 127 |
| population | 124 |
| diversity | 112 |
| nature | 102 |
| community | 100 |
| male | 97 |
| plant | 97 |
| different | 95 |
| patterns | 88 |
| present | 88 |
| areas | 84 |
| interaction | 82 |
Word cloud
textplot_wordcloud(x=dfm(tokens(text$word)))par(mfrow=c(1,2))
textplot_wordcloud(x=dfm(tokens(text$word[text$gender=="F"])),
col="#6D57CF")
par(new=T)
textplot_wordcloud(x=dfm(tokens(text$word[text$gender=="M"])),
col="#FCA532")Word frequencies by gender
# Relative frequency of every word within each gender, reshaped so each word is
# one row with proportion_F / proportion_M (and n_F / n_M) side by side.
# Words used by only one gender get NA in the other gender's columns.
props <- text %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  # release the grouping before reshaping: gender is about to become column names
  ungroup() %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(
    abs.dif.p = abs(proportion_F - proportion_M),
    rel.dif.p = pmax(proportion_F, proportion_M) /
      pmin(proportion_F, proportion_M)
  ) %>%
  arrange(desc(abs.dif.p))

# Label only the 20 words with the largest absolute difference (rows are
# already sorted by abs.dif.p, descending).
props$label <- NA
props$label[1:20] <- props$word[1:20]

# Male vs. female word proportions on log-log axes; the dashed identity line
# marks words used equally often by both genders.
# (The original call had a stray empty argument: `aes(x=proportion_M,, y=...)`.)
ggplot(props, aes(x = proportion_M, y = proportion_F, color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  #geom_point(size=2.5, alpha=0.5)+
  geom_jitter(size = 2.5, alpha = 0.2) +
  geom_text_repel(aes(label = label), size = 3.2) +
  scale_x_log10(name = "Male most used words",
                labels = percent_format()) +
  scale_y_log10(name = "Female most used words",
                labels = percent_format()) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0)) # geom_smooth(method="lm")
ggsave("figures/abstract_wordFreq.jpg", height = 5, width=7)Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.
Legend: absolute differences in the frequency of the word by males and females. Differences above 0.3% are also indicated in text.
Correlation of word frequency use between genders:
cor.test(props$proportion_F, props$proportion_M)##
## Pearson's product-moment correlation
##
## data: props$proportion_F and props$proportion_M
## t = 70.789, df = 1677, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8530819 0.8771137
## sample estimates:
## cor
## 0.8655954
Highly correlated: both genders tend to use the main words at similar frequencies.
20 words with the largest differences in frequency
# Diverging bar chart for the 20 labelled words: female proportions are negated
# so they extend to the left of zero, male proportions to the right.
prop2 <- props %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  # order the words on the axis by their total count
  mutate(word = fct_reorder(word, (ntot), max),
         proportion_F = proportion_F * -1) %>%
  # pick the two proportion columns by name instead of by position (2:3), so a
  # change in column order cannot silently reshape the wrong columns
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")
ggplot(prop2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() + ylab("") + xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = c(-0.02, -0.01, 0, 0.01, 0.02),
             linetype = "dotted",
             col = "darkgray") +
  # axis labels show absolute proportions on both sides of zero
  scale_x_continuous(breaks = c(-0.02, -0.01, 0, 0.01, 0.02),
                     labels = c(0.02, 0.01, 0, 0.01, 0.02))
ggsave("figures/abstract_wordFreq_barplot.jpeg", units="in", width=7, height=7, dpi=300)
TF IDF
text_id <- text %>% count(gender, word) %>%
bind_tf_idf(word, gender, n) %>%
arrange(desc(tf_idf))
10 "exclusive" words for each group
# Bar chart of the 10 highest tf-idf words per gender, one facet per gender.
text_id$word <- as.factor(text_id$word)
text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf) %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # tf_idf is mapped to x and the words to y, so the "tf-idf" title belongs on
  # the x axis (the original had the two titles swapped)
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()
WORDS professors only data
textP <- text %>% filter(position_cat == "professor")
table(textP$gender)##
## F M
## 2062 5370
Mean number of words by abstract
textP %>% count(id,gender) %>%
ggplot(aes(x=gender, y=n)) +
geom_violin() + geom_boxplot(width=0.2)+
ggbeeswarm::geom_quasirandom(size=3, shape=21) 20 palavra mais comuns
textP %>%
count(word, sort = TRUE) %>%
top_n(20,n)%>%
kable()| word | n |
|---|---|
| species | 88 |
| ecology | 67 |
| evolution | 52 |
| population | 52 |
| environment | 50 |
| nature | 44 |
| plant | 43 |
| model | 41 |
| study | 41 |
| can | 39 |
| ecosystem | 38 |
| diversity | 35 |
| society | 32 |
| water | 32 |
| pollination | 30 |
| research | 30 |
| interaction | 29 |
| science | 29 |
| biology | 28 |
| present | 26 |
Counting words Frequency by gender
propsP <- textP %>%
count(gender, word) %>%
group_by(gender) %>%
mutate(proportion = n / sum(n)) %>%
pivot_wider(names_from = gender, values_from = c(proportion,n)) %>%
mutate(abs.dif.p = abs(proportion_F-proportion_M),
rel.dif.p = pmax(proportion_F, proportion_M)/
pmin(proportion_F, proportion_M)) %>%
arrange(desc(abs.dif.p))
propsP$label <- NA
propsP$label[1:20] <- propsP$word[1:20]ggplot(propsP, aes(x=proportion_M, y=proportion_F,
color=abs.dif.p)) +
geom_abline(color = "gray40", lty = 2) +
# geom_point(size=2.5, alpha=0.3) +
geom_jitter(size=2.5, alpha=0.3)+
geom_text_repel(aes(label=label), size=3)+
scale_x_log10(name="Male most used words", limits=c(0.0003,0.02),
labels = percent_format()) +
scale_y_log10(name="Female Most used words", limits=c(0.0003,0.02),
labels = percent_format()) +
scale_color_gradient(name="Abs Diff",low = "blue", high = "red",
labels=percent_format()) +
theme(legend.justification = c(1, -0.1), legend.position = c(1, 0)) # geom_smooth(method="lm")
ggsave("figures/abstract_wordFreq_Prof.jpg", height = 5, width=7)Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.
Legend: absolute differences in the frequency of the word by males and females.
Labels for the 20 words with largest differences in frequency.
Correlation of word frequency use between genders:
cor.test(propsP$proportion_F, propsP$proportion_M)##
## Pearson's product-moment correlation
##
## data: propsP$proportion_F and propsP$proportion_M
## t = 20.607, df = 559, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6072948 0.7016627
## sample estimates:
## cor
## 0.6570452
20 words with the largest differences in frequency
# Diverging bar chart for the 20 labelled words (professors only): female
# proportions are negated so they extend to the left of zero, male to the right.
propP2 <- propsP %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  # order the words on the axis by their total count
  mutate(word = fct_reorder(word, (ntot), max),
         proportion_F = proportion_F * -1) %>%
  # pick the two proportion columns by name instead of by position (2:3)
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")
ggplot(propP2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() + ylab("") + xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = c(-0.03, -0.02, -0.01, 0, 0.01, 0.02, 0.03),
             linetype = "dotted",
             col = "darkgray") +
  # axis labels show absolute proportions on both sides of zero; the label for
  # the -0.01 break was previously printed as -0.01
  scale_x_continuous(breaks = c(-0.03, -0.02, -0.01, 0, 0.01, 0.02, 0.03),
                     labels = c(0.03, 0.02, 0.01, 0, 0.01, 0.02, 0.03))
ggsave("figures/abstract_wordFreq_barplot_Prof.jpeg", units="in", width=7, height=7, dpi=300)
Topic model - all data
matext <- text %>% count(id, gender, word) %>% mutate(id = paste(id, gender, sep="_")) %>%
select(-gender) %>%
cast_dtm(term=word,document=id,value=n)3 models with different number of topics, comparing AIC - cross validation is better? loo
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4,base=T)## AIC dAIC df
## ap_lda2 377783.9 0.0 9919
## ap_lda3 379803.9 2020.0 14878
## ap_lda4 383258.3 5474.4 19837
Word-topic probabilities
10 words with the largest probabilities for each group
ap_topics <- tidy(ap_lda2, matrix = "beta")
ap_top_terms <- ap_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
ap_top_terms %>%
mutate(term = reorder(term, beta)) %>% ggplot(aes(term, beta, fill = factor(topic))) + geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") + coord_flip()Document-topic probabilities - classifying the abstracts
ap_documents <- tidy(ap_lda2, matrix = "gamma")
classifi <- ap_documents %>% mutate(gender = substr(document, nchar(document), nchar(document))) %>%
group_by(document,gender) %>%
top_n(1, gamma)
table(classifi$gender, classifi$topic)##
## 1 2
## F 55 46
## M 62 76
classifi %>% tabyl(gender, topic) %>% adorn_percentages() %>%
adorn_pct_formatting(digits = 0) ## gender 1 2
## F 54% 46%
## M 45% 55%
classifi %>%
# mutate(title = reorder(title, gamma * topic)) %>%
ggplot(aes(as.character(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ gender)Topic model Professors only
matextP <- textP %>%
count(id, gender, word) %>% mutate(id = paste(id, gender, sep="_")) %>%
select(-gender) %>%
cast_dtm(term=word,document=id,value=n)ap_lda2 <- LDA(matextP, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matextP, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matextP, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4,base=T)## AIC dAIC df
## ap_lda2 113448.6 0.0 5203
## ap_lda3 115229.2 1780.6 7804
## ap_lda4 117829.0 4380.4 10405
word-topic probabilities
ap_topics <- tidy(ap_lda2, matrix = "beta")
ap_top_terms <- ap_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
ap_top_terms %>%
mutate(term = reorder(term, beta)) %>% ggplot(aes(term, beta, fill = factor(topic))) + geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") + coord_flip()Document-topic probabilities
ap_documents <- tidy(ap_lda2, matrix = "gamma")
classifi <- ap_documents %>% mutate(gender = substr(document, nchar(document), nchar(document))) %>%
group_by(document,gender) %>%
top_n(1, gamma)
table(classifi$gender, classifi$topic)##
## 1 2
## F 11 10
## M 36 23
library(janitor)
classifi %>% tabyl(gender, topic) %>% adorn_percentages() %>%
adorn_pct_formatting(digits = 0) ## gender 1 2
## F 52% 48%
## M 61% 39%
classifi %>%
# mutate(title = reorder(title, gamma * topic)) %>%
ggplot(aes(as.character(topic), gamma)) +
geom_boxplot() +
geom_violin()+
facet_wrap(~ gender)Sentiment analysis
Chapter 2, Silge & Robinson. 2018
- The NRC lexicon categorizes words in a binary fashion ("yes"/"no") into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
get_sentiments("nrc")## # A tibble: 13,875 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,865 more rows
- The Bing lexicon categorizes words in a binary fashion into positive and negative categories.
get_sentiments("bing")## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
- The AFINN lexicon assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
get_sentiments("afinn")## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
PENSAR: tem que levar em conta número de palavras diferentes entre abstracts - principalmente se houver diferença média de número de palavras por abstract de homens e mulheres né? ou não?
Score words difference in female and male abstracts
All data
affword <- get_sentiments("afinn")
affc <- text %>%
count(id,gender,word, sort = TRUE) %>%
inner_join(affword, "word")
Calculating the mean of the scores for each abstract (weighted by the number of times the word appears) by gender:
affc %>% group_by(id, gender) %>%
summarise(mean.score = mean(value),
weig.score = weighted.mean(value,n)) %>%
ggplot(aes(x=gender,y=weig.score)) +
geom_violin() +
geom_boxplot(width=0.1) +
geom_quasirandom()+
ggtitle("Mean words score per abstract and gender")Professors
affword <- get_sentiments("afinn")
affc <- textP %>%
count(id,gender,word, sort = TRUE) %>%
inner_join(affword, "word")
Calculating the mean of the scores for each abstract (weighted by the number of times the word appears) by gender:
affc %>% group_by(id, gender) %>%
summarise(mean.score = mean(value),
weig.score = weighted.mean(value,n)) %>%
ggplot(aes(x=gender,y=weig.score)) +
geom_violin() +
geom_boxplot(width=0.1) +
geom_quasirandom()+
ggtitle("Mean words score per abstract and gender")Frequency of sentiment words per abstract
As classificações das palavras não me parecem muito acuradas com a linguagem científica.
Precisa saber como ponderar pelo total de palavras.
All data
nrcword <- get_sentiments("nrc")
nrc <- text %>%
count(id,gender,word, sort = TRUE) %>%
inner_join(nrcword, "word") %>%
group_by(id,gender,sentiment) %>%
summarise(n= sum(n))
ggplot(nrc, aes(x=gender, y=n)) +
facet_wrap(~sentiment) +
geom_violin() +
geom_quasirandom()Professors
nrcword <- get_sentiments("nrc")
nrc <- textP %>%
count(id,gender,word, sort = TRUE) %>%
inner_join(nrcword, "word") %>%
group_by(id,gender,sentiment) %>%
summarise(n= sum(n))
ggplot(nrc, aes(x=gender, y=n)) +
facet_wrap(~sentiment) +
geom_violin()+
geom_quasirandom()nrc %>% filter(sentiment == "positive") %>%
ggplot( aes(x=gender, y=n)) +
geom_violin() +
geom_boxplot(width=0.2) +
geom_quasirandom()+
ggtitle("Positive words")Frequency of sentiment words per abstract
All data
bingword <- get_sentiments("bing")
bing <- text %>%
count(id,gender,word, sort = TRUE) %>%
inner_join(bingword, "word") %>%
group_by(id,gender,sentiment) %>%
summarise(n= sum(n))
ggplot(bing, aes(x=sentiment, y=n)) +
facet_wrap(~gender) +
geom_violin()+
geom_quasirandom()Professors
bingword <- get_sentiments("bing")
bing <- textP %>%
count(id,gender,word, sort = TRUE) %>%
inner_join(bingword, "word") %>%
group_by(id,gender,sentiment) %>%
summarise(n= sum(n))
ggplot(bing, aes(x=sentiment, y=n)) +
facet_wrap(~gender) +
geom_violin() +
geom_boxplot(width=0.2) +
geom_quasirandom()